The purpose of this exploration is to get acquainted with the lyrics data set and to perform any transformations that are likely to be useful in subsequent analysis.
Early Rush albums had fewer songs, and were produced every year or two. The last three albums were separated by about five years.
# Bar chart of the number of songs per album, positioned by release year.
lyrics %>%
  count(released, album) %>%
  # Plot "Fly by Night" at 1974 rather than its recorded release year.
  # NOTE(review): presumably this keeps its bar from sharing an x position
  # with another album released the same year -- confirm.
  mutate(released = if_else(album == "Fly by Night", 1974L, released)) %>%
  ggplot(aes(x = released, y = n)) +
  geom_col(fill = "lightgoldenrod") +
  # Album names are written vertically, starting just above the x axis.
  geom_text(
    aes(label = album, y = 0.5),
    angle = 90, hjust = "bottom", vjust = .25,
    size = 3, color = "lightgoldenrod4"
  ) +
  scale_x_continuous(
    limits = c(1970, 2015),
    breaks = seq(1970, 2015, by = 5),
    minor_breaks = 1970:2015,
    expand = c(0, 0)
  ) +
  theme_light() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  labs(
    x = "Released", y = "Songs on Album",
    title = glue("Neil Peart authored lyrics to {nrow(lyrics)} songs across {n_distinct(lyrics$album)} albums."),
    subtitle = "Song Counts"
  )

# Attach the number of lyric lines per song (`n_lines`). The inner join keeps
# only songs that have at least one row in `lyrics_lines`.
lyrics_1 <- lyrics %>%
  inner_join(lyrics_lines %>% count(song_id, name = "n_lines"), by = "song_id")

# Quantiles (0%, 25%, 50%, 75%, 100%) of lines per song; used in the plot title.
lc_quant <- quantile(lyrics_1$n_lines)

The IQR of lines per song was 20 to 24 lines. The shortest song was Jacob’s Ladder (14 lines). The longest was Red Tide (38 lines).
# Box plot of lines per song for each release year, with the individual songs
# overlaid as points.
p <- lyrics_1 %>%
  ggplot(aes(
    x = released, y = n_lines, group = as.factor(released),
    # `text` is not drawn by ggplot2 itself; ggplotly() below uses it as the
    # hover tooltip (tooltip = "text").
    text = glue("{song} ({album})<br>",
                "Lines: {n_lines}")
  )) +
  geom_boxplot(color = "goldenrod4", fill = "lightgoldenrod") +
  # height = 0: no vertical jitter, so the plotted line counts stay exact.
  geom_jitter(color = "goldenrod", height = 0, size = 2, alpha = 0.6) +
  theme_light() +
  labs(
    x = "Released", y = "Line Count",
    title = glue("Songs usually vary between {lc_quant['25%']} and {lc_quant['75%']} lines.")
  )

# Render as an interactive plotly widget.
ggplotly(p, tooltip = "text")

The tidytext package will split the text into “tokens” (words). Let’s count words.
# Tokenise each song's lyrics into one row per word, then count words per song.
word_count <- lyrics %>%
  unnest_tokens(output = word, input = lyrics, token = "words") %>%
  count(song_id, name = "n_words")

# Add the per-song word count (`n_words`) alongside `n_lines`.
lyrics_2 <- lyrics_1 %>%
  inner_join(word_count, by = "song_id")

# Quantiles (0%, 25%, 50%, 75%, 100%) of words per song; used in the plot title.
wc_quant <- quantile(lyrics_2$n_words)

The IQR of words per song was 110 to 140 words. The shortest song was Animate (69 words). The longest were The Big Wheel and The Weapon (182 words each).
# Box plot of words per song for each release year, with the individual songs
# overlaid as points.
p <- lyrics_2 %>%
  ggplot(aes(
    x = released, y = n_words, group = as.factor(released),
    # `text` is not drawn by ggplot2 itself; ggplotly() below uses it as the
    # hover tooltip (tooltip = "text").
    text = glue("{song} ({album})<br>",
                "Lines: {n_lines}<br>",
                "Words: {n_words}")
  )) +
  geom_boxplot(color = "goldenrod4", fill = "lightgoldenrod") +
  # height = 0: no vertical jitter, so the plotted word counts stay exact.
  geom_jitter(color = "goldenrod", height = 0, size = 2, alpha = 0.6) +
  theme_light() +
  labs(
    x = "Released", y = "Word Count",
    title = glue("Songs usually vary between {wc_quant['25%']} and {wc_quant['75%']} words.")
  )

# Render as an interactive plotly widget.
ggplotly(p, tooltip = "text")

An easy-to-calculate measure of song length is the string length of the entire song. Let’s see the distribution.
# Add the character length of each song's full lyrics text (`n_chars`).
# str_length() is vectorised, so it is applied to the whole column at once.
lyrics_3 <- lyrics_2 %>%
  mutate(n_chars = str_length(lyrics))

# Quantiles (0%, 25%, 50%, 75%, 100%) of characters per song; used in the
# plot title.
cc_quant <- quantile(lyrics_3$n_chars)

The IQR of song lengths was 561 to 731 characters. The shortest songs were Animate and Earthshine, both at 397 characters. The longest song was Territories (946 characters).
# Box plot of characters per song for each release year, with the individual
# songs overlaid as points.
p <- lyrics_3 %>%
  ggplot(aes(
    x = released, y = n_chars, group = as.factor(released),
    # `text` is not drawn by ggplot2 itself; ggplotly() below uses it as the
    # hover tooltip (tooltip = "text").
    text = glue("{song} ({album})<br>",
                "Chars: {n_chars}")
  )) +
  geom_boxplot(color = "goldenrod4", fill = "lightgoldenrod") +
  # height = 0: no vertical jitter, so the plotted character counts stay exact.
  geom_jitter(color = "goldenrod", height = 0, size = 2, alpha = 0.6) +
  theme_light() +
  labs(
    x = "Released", y = "Character Count",
    title = glue("Songs usually vary between {cc_quant['25%']} and {cc_quant['75%']} characters.")
  )

# Render as an interactive plotly widget.
ggplotly(p, tooltip = "text")

Save the lyrics with summary stats for subsequent steps.
# Persist the per-song table, including the line, word and character counts
# added above, as an RDS file in the current working directory.
saveRDS(object = lyrics_3, file = "./2_lyrics.Rds")